{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model Development"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import time\n",
    "\n",
    "# preprocessing\n",
    "from config import DATA_CSV, PIXEL_ARRAY, DTYPE, Y_LABEL\n",
    "from config import FEATURES, OUTPUT_FOLDER, FORMAT\n",
    "from config import FEATURES_ARRAY, FEATURES_ARRAY2, FEATURES_ARRAY3\n",
    "from preprocessing import to_arr\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler\n",
    "\n",
    "\n",
    "# pipeline\n",
    "from sklearn.pipeline import Pipeline\n",
    "from config import TRANSFORMATION_LIST, EXTRACTION_LIST\n",
    "from sklearn.preprocessing import FunctionTransformer\n",
    "\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# model\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.svm import SVC, LinearSVC, NuSVC\n",
    "from sklearn.linear_model import LogisticRegression, SGDClassifier\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier  # , GradientBoostingClassifier\n",
    "import xgboost as xgb\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "\n",
    "from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, roc_auc_score\n",
    "\n",
    "# Seed passed as random_state to the estimators and resamplers in this notebook.\n",
    "RANDOM_STATE = 0\n",
    "\n",
    "# Stand-alone baseline tree; no active code visible in this notebook reads this name.\n",
    "classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)\n",
    "# (short name, estimator) pairs for model comparison; the loop that iterates them\n",
    "# is currently commented out in the feature-comparison cell.\n",
    "classifiers = [\n",
    "    ('SVC', SVC(random_state=RANDOM_STATE, gamma='scale')),\n",
    "    ('LOGR', LogisticRegression(random_state=RANDOM_STATE, solver='lbfgs', max_iter=100)),\n",
    "    ('CART', DecisionTreeClassifier(criterion='gini', random_state=RANDOM_STATE)),\n",
    "    ('RF', RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)),\n",
    "    ('MLP', MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', random_state=RANDOM_STATE))\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference timestamp read by time_it(); rebind it with `t0 = time.time()` to start a new lap.\n",
    "# Defined next to the helper so the first time_it() call cannot hit an undefined name.\n",
    "t0 = time.time()\n",
    "\n",
    "\n",
    "def time_it(start=None):\n",
    "    \"\"\"Print the minutes elapsed since a reference timestamp, followed by the current epoch time.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    start : float, optional\n",
    "        Reference timestamp as returned by time.time(). When omitted, the\n",
    "        notebook-level t0 is used, so existing time_it() calls keep working.\n",
    "    \"\"\"\n",
    "    now = time.time()\n",
    "    reference = t0 if start is None else start\n",
    "    print(\"Used time: {:.2f} min    {:.2f}\".format((now - reference) / 60, now))\n",
    "\n",
    "\n",
    "# Utility function to report best scores\n",
    "def report(results, n_top=3):\n",
    "    \"\"\"Print rank, mean/std test score and parameters for the best-ranked candidates.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    results : mapping\n",
    "        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'\n",
    "        and 'params', each indexable by candidate position (presumably a\n",
    "        scikit-learn search `cv_results_` dict - confirm against the caller).\n",
    "    n_top : int\n",
    "        Ranks 1..n_top are printed; tied candidates are all listed.\n",
    "    \"\"\"\n",
    "    for rank in range(1, n_top + 1):\n",
    "        # Positions of every candidate holding this rank (ties give several).\n",
    "        for idx in np.flatnonzero(results['rank_test_score'] == rank):\n",
    "            print(\"Model with rank: {0}\".format(rank))\n",
    "            mean_score = results['mean_test_score'][idx]\n",
    "            std_score = results['std_test_score'][idx]\n",
    "            print(\"Mean validation score: {0:.3f} (std: {1:.3f})\".format(mean_score, std_score))\n",
    "            print(\"Parameters: {0}\".format(results['params'][idx]))\n",
    "            print(\"\")\n",
    "\n",
    "\n",
    "def save_result(all_results, filename, feature_names=None):\n",
    "    \"\"\"Store a results table, indexed by feature name, as '<filename>.pkl'.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    all_results : anything accepted by pd.DataFrame\n",
    "        Result rows; they are labelled one-to-one with the feature names.\n",
    "    filename : str\n",
    "        Output path without extension; '.pkl' is appended.\n",
    "    feature_names : sequence, optional\n",
    "        Labels written to the 'feature' index. Defaults to the notebook-level\n",
    "        `features` variable, so existing two-argument calls are unaffected.\n",
    "    \"\"\"\n",
    "    if feature_names is None:\n",
    "        # Fall back to the notebook global for backward compatibility.\n",
    "        feature_names = features\n",
    "    compare = pd.DataFrame(all_results)\n",
    "    compare['feature'] = feature_names\n",
    "    compare.set_index('feature').to_pickle(filename + '.pkl')\n",
    "\n",
    "\n",
    "def save_model(filename=None):\n",
    "    \"\"\"Fit a decision tree on the current train split, print test metrics, optionally pickle it.\n",
    "\n",
    "    Reads the notebook-level X_train, y_train, X_test and y_test.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    filename : str, optional\n",
    "        When given, the fitted model is pickled to this path. With the default\n",
    "        (None) the function only trains and reports, and nothing is written.\n",
    "    \"\"\"\n",
    "    # Fit the model on training set\n",
    "    # model = RandomForestClassifier()\n",
    "    model = DecisionTreeClassifier(random_state=RANDOM_STATE)\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    y_pred = model.predict(X_test)\n",
    "\n",
    "    print(accuracy_score(y_test, y_pred))\n",
    "    print(recall_score(y_test, y_pred))\n",
    "    print(roc_auc_score(y_test, y_pred))\n",
    "    print(classification_report(y_test, y_pred))\n",
    "\n",
    "    # Save the model\n",
    "    if filename is not None:\n",
    "        import pickle  # local import: only needed when a path is supplied\n",
    "\n",
    "        # The context manager closes the file even if pickling fails.\n",
    "        with open(filename, 'wb') as fh:\n",
    "            pickle.dump(model, fh)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocessing applied to each loaded feature file: to_arr first, then min-max scaling.\n",
    "transformer_pipe = Pipeline([\n",
    "    ('to_arr', FunctionTransformer(to_arr)),\n",
    "    ('minmax', MinMaxScaler()),\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target labels, used as y for resampling and model fitting in the feature-comparison cell.\n",
    "y = np.load(Y_LABEL)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Comparing features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = FEATURES_ARRAY3\n",
    "scores = {}\n",
    "\n",
    "# Start the timers before the loop: time_it() reads the global t0, so it must\n",
    "# exist before the first call or a fresh kernel fails with a NameError.\n",
    "run_start = time.time()\n",
    "t0 = run_start\n",
    "\n",
    "# for feature in [OUTPUT_FOLDER + 'lbp2' + FORMAT]:  # features:\n",
    "for feature in features:\n",
    "    print(\"\"\"\n",
    "    ----------------------------------\n",
    "    getting feature: {}\n",
    "    ----------------------------------\n",
    "    \"\"\".format(feature))\n",
    "\n",
    "    # NOTE: allow_pickle=True unpickles objects on load, which can run arbitrary code;\n",
    "    # only load feature files from a trusted source.\n",
    "    X_raw = np.load(feature, allow_pickle=True)\n",
    "    # NOTE(review): the scaler is still fitted on all samples before the split;\n",
    "    # confirm to_arr works on a pre-split array before moving this after the split.\n",
    "    X = transformer_pipe.fit_transform(X_raw)\n",
    "\n",
    "    # Hold out the test set first so it contains only real samples, and seed the\n",
    "    # split so the scores are reproducible between runs.\n",
    "    X_train, X_test, y_train, y_test = train_test_split(\n",
    "        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)\n",
    "\n",
    "    # Resampling: balance the training split only. Oversampling before the split\n",
    "    # would put synthetic neighbours of test samples into the training data.\n",
    "    # rus = RandomUnderSampler(random_state=RANDOM_STATE)\n",
    "    # X_res, y_res = rus.fit_resample(X, y)\n",
    "\n",
    "    # RandomOverSampler(random_state=RANDOM_STATE)\n",
    "    # ADASYN(random_state=RANDOM_STATE)\n",
    "    smote = SMOTE(random_state=RANDOM_STATE)\n",
    "    X_train, y_train = smote.fit_resample(X_train, y_train)\n",
    "\n",
    "    # One model for all\n",
    "    model = SVC(random_state=RANDOM_STATE)\n",
    "    model.fit(X_train, y_train)\n",
    "    y_pred = model.predict(X_test)\n",
    "\n",
    "    # scikit-learn metric signature is (y_true, y_pred).\n",
    "    score = accuracy_score(y_test, y_pred)\n",
    "    scores[feature] = score\n",
    "\n",
    "    # Cross validate all models\n",
    "    # feature_acc = []\n",
    "    # feature_rec = []\n",
    "    # for name, classifier in classifiers:\n",
    "    #     pipe = Pipeline(steps=[\n",
    "    #         ('classifier', classifier)\n",
    "    #     ])\n",
    "    #     pipe.fit(X_train, y_train)\n",
    "    #\n",
    "    #     acc_score = cross_val_score(pipe, X_train, y_train, cv=10, scoring='accuracy')\n",
    "    #     acc_result = {name: acc_score}\n",
    "    #\n",
    "    #     rec_score = cross_val_score(pipe, X_train, y_train, cv=10, scoring='recall')\n",
    "    #     rec_result = {name: rec_score}\n",
    "    #\n",
    "    #     feature_acc.append(acc_result)\n",
    "    #     feature_rec.append(rec_result)\n",
    "    #     print(acc_score, rec_score)\n",
    "    #\n",
    "    # np.save(feature[:-4] + '_acc.npy', feature_acc)\n",
    "    # np.save(feature[:-4] + '_rec.npy', feature_rec)\n",
    "\n",
    "    # Checkpoint the scores gathered so far after every feature set.\n",
    "    np.save('svm2', scores)\n",
    "\n",
    "    # Report the time spent on this feature set, then start the next lap.\n",
    "    time_it()\n",
    "    t0 = time.time()\n",
    "\n",
    "np.save('svm3', scores)\n",
    "# Rewind the reference so the final report covers the whole run, not just the last save.\n",
    "t0 = run_start\n",
    "time_it()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
